The goal of this project is to perform keyword network analysis and word frequency analysis in order to draw insights from the data.

#Importing the libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(janeaustenr)
library(ggplot2)
library(tidyr)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

Task 1

The objective of this task is to build an adjacency matrix for the keywords in an article and convert the adjacency matrix to a weighted network. After that, we will compute the strength and degree of the network and show the top keywords by strength and degree. Then, we will find out the top keyword pairs by finding top weighted edges. Finally, by plotting the average strength and degree, we will analyze the network.

Computing keyword co-occurrence matrix

# Read the article keyword table; empty cells become NA so they can be
# dropped row by row below.
# NOTE(review): absolute, machine-specific path - consider a relative path.
Keyword_data <- read.csv("/Users/bardia/Desktop/Keyword_data.csv", na.strings = "")

# Stack all keyword columns into one long vector to collect the vocabulary
s <- stack(Keyword_data)
# Unique keywords become the rows/columns of the adjacency matrix.
# Drop NA explicitly so empty cells do not become a spurious node.
u <- unique(s$values)
u <- u[!is.na(u)]

# Square weighted adjacency matrix, initialised to zero co-occurrences
answer <- matrix(0, nrow = length(u), ncol = length(u))
colnames(answer) <- u
rownames(answer) <- u

# For every article (row), count each unordered keyword pair once in both
# triangles of the matrix, so the matrix stays symmetric.
for (i in seq_len(nrow(Keyword_data))) {   # seq_len() is safe for zero rows
  temp <- unlist(Keyword_data[i, ])
  temp <- unique(temp[!is.na(temp)])       # drop NAs and duplicated keywords
  if (length(temp) < 2) next               # combn() errors on < 2 items
  keyword_list <- combn(temp, 2)
  for (j in seq_len(ncol(keyword_list))) {
    rowind <- which(rownames(answer) == keyword_list[1, j])
    colind <- which(colnames(answer) == keyword_list[2, j])
    answer[rowind, colind] <- answer[rowind, colind] + 1
    answer[colind, rowind] <- answer[colind, rowind] + 1
  }
}

Converting the adjacency matrix to a weighted network and computing the degree and strength

# Build an undirected weighted network from the keyword co-occurrence matrix;
# matrix cell values become edge weights (weighted = TRUE)
ad_mat <- answer
network <- graph_from_adjacency_matrix(ad_mat, mode="undirected", weighted=TRUE)

# Uncomment to inspect the edge / vertex attributes of the network
#edge_attr(network)
#vertex_attr(network)

# Degree (number of incident edges) and strength (sum of incident edge
# weights) for every keyword node.
# NOTE(review): `strength` shadows igraph::strength(); later chunks depend on
# this variable name, so it is deliberately kept as-is.
deg <- degree(network)
strength <- strength(network)

Top 10 nodes by degree and strength

#Top ten nodes by degree and strength
# sort() yields a named vector, so as.data.frame() keeps the keyword names
# as row names; [1:10] takes the ten best-connected keywords
top_deg <- as.data.frame(sort(deg, decreasing = TRUE)[1:10])
colnames(top_deg) <- "top degree nodes"
print(top_deg)
##                              top degree nodes
## ORGANIZATIONAL behavior                   166
## ORGANIZATIONAL effectiveness              104
## MANAGEMENT science                        102
## PERSONNEL management                       93
## DECISION making                            90
## ORGANIZATIONAL structure                   74
## ORGANIZATIONAL sociology                   66
## STRATEGIC planning                         66
## INDUSTRIAL management                      64
## CORPORATE governance                       62
# Same idea for strength: top ten keywords by total co-occurrence weight
top_strength <- as.data.frame(sort(strength, decreasing = TRUE)[1:10])
colnames(top_strength) <- "top strength nodes"
print(top_strength)
##                              top strength nodes
## ORGANIZATIONAL behavior                     265
## ORGANIZATIONAL effectiveness                144
## MANAGEMENT science                          136
## PERSONNEL management                        126
## DECISION making                             112
## ORGANIZATIONAL structure                    107
## ORGANIZATIONAL sociology                     96
## CORPORATE governance                         85
## INDUSTRIAL management                        84
## STRATEGIC planning                           80

Computing top 10 pairs of keyword (top 10 edges)

#FINDING TOP TEN EDGES BY WEIGHT
# Weight cutoff of the 10th heaviest edge; edges tied at the cutoff are kept
min_w <- min(sort(E(network)$weight, decreasing = TRUE)[1:10])
# ends() lists endpoints in the same order as the edge sequence, so the
# matching weights can be attached directly - this replaces the old per-row
# loop over get.edge.ids() (deprecated name, and O(edges * lookups))
keep <- E(network)$weight >= min_w
top_edges <- as.data.frame(ends(network, E(network)[keep], names = TRUE))
top_edges$weight <- E(network)$weight[keep]
colnames(top_edges) <- c("node1", "node2", "weight")
top_edges <- top_edges %>% arrange(desc(weight))
print(top_edges[1:10, ])
##                           node1                        node2 weight
## 1       ORGANIZATIONAL behavior ORGANIZATIONAL effectiveness     11
## 2       ORGANIZATIONAL behavior     ORGANIZATIONAL structure      9
## 3          PERSONNEL management      ORGANIZATIONAL behavior      8
## 4            MANAGEMENT science      ORGANIZATIONAL behavior      7
## 5               DECISION making      ORGANIZATIONAL behavior      6
## 6          CORPORATE governance      ORGANIZATIONAL behavior      6
## 7       ORGANIZATIONAL behavior     ORGANIZATIONAL sociology      6
## 8  ORGANIZATIONAL effectiveness     ORGANIZATIONAL structure      6
## 9          INDUSTRIAL relations      ORGANIZATIONAL behavior      5
## 10      ORGANIZATIONAL behavior        ORGANIZATIONAL change      5

Plotting average strength by degree

#plot
# data_frame() is deprecated; tibble() is the drop-in replacement.
# Average strength per degree value is the mean of strength within each
# degree group - equivalent to the old sum/count construction, but in one
# summarise() instead of three grouped mutate() passes.
plt_df <- tibble(degree = deg, strength = strength) %>%
  group_by(degree) %>%
  summarise(average_strength = mean(strength), .groups = "drop")

# Degree-0 nodes cannot appear on log-log axes (log10(0) = -Inf previously
# triggered "infinite values" warnings), so drop them before plotting.
# The stray aes(main = "hh") mapping was unused and has been removed.
plt_df %>%
  filter(degree > 0) %>%
  ggplot(aes(degree, average_strength)) +
  geom_point() +
  labs(title = "Average strength by degree", x = "k", y = "<s>") +
  scale_x_log10() +
  scale_y_log10()

TASK 2

The objective of this task is to perform word frequency analysis on twitter data (Elon Musk’s tweets) from 2017-2021. First, we will compute the word frequencies for each year with and without stop words. After that we will plot the word frequencies for each year. Then, we will use Zipf’s law to analyze the data by plotting word frequency by rank. The final task is to create bigram network graphs for each year.

#READING FILES
# One CSV of tweets per year (2017-2021); each file provides the `tweet`
# column consumed by the tokenization steps below.
# NOTE(review): absolute, machine-specific paths - consider file.path() with
# a project-relative directory for reproducibility.
df_2017<-read.csv("/Users/bardia/Desktop/Project 2/Tweeter data/2017.csv")
df_2018<-read.csv("/Users/bardia/Desktop/Project 2/Tweeter data/2018.csv")
df_2019<-read.csv("/Users/bardia/Desktop/Project 2/Tweeter data/2019.csv")
df_2020<-read.csv("/Users/bardia/Desktop/Project 2/Tweeter data/2020.csv")
df_2021<-read.csv("/Users/bardia/Desktop/Project 2/Tweeter data/2021.csv")

2017 twitter data

Word frequency for 2017

# Tokenize each 2017 tweet into single words and count occurrences,
# most frequent first
data_2017<-df_2017%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)

Excluding stop words

# Remove common English stop words via tidytext's stop_words lexicon
data_2017<-data_2017 %>%
  anti_join(stop_words,by="word")

Excluding irrelevant words to the analysis such as t.co, https, http

# URL fragments and curly-apostrophe contractions that survive tokenization;
# `irr` is defined once here and reused by every later year section
irr <- c("t.co","http","https","it’s","don’t","you’re")
data_2017<- data_2017 %>%
  filter(!word %in% irr)

Displaying top 10 words by highest value of word frequency for the year 2017

# data_2017 is already sorted by n, so the first 10 rows are the top words
top_2017 <- head(data_2017,10)
top_2017
##      word   n
## 1   tesla 315
## 2     amp 219
## 3   model 208
## 4  rocket 149
## 5  spacex 127
## 6  launch 112
## 7     car  99
## 8  falcon  99
## 9       3  98
## 10   time  97

Plotting histograms of word frequency for the year 2017

# Attach the corpus total, rank (1 = most frequent), and relative term
# frequency in one mutate() call - equivalent to assigning `total` first and
# mutating afterwards.
data_2017 <- data_2017 %>%
  mutate(
    total = sum(n),
    rank = row_number(),
    `term frequency` = n / total
  )

# Term-frequency histogram for 2017; x-axis truncated at 0.0009 to cut the
# long right tail.
ggplot(data_2017, aes(`term frequency`, fill = word)) +
  geom_histogram(colour = "Blue", show.legend = FALSE, bins = 30) +
  xlim(NA, 0.0009)
## Warning: Removed 117 rows containing non-finite values (stat_bin).
## Warning: Removed 8073 rows containing missing values (geom_bar).

Using Zipf’s law and plotting log-log plots of word frequencies and rank for the year 2017

# Rebuild the 2017 word counts (tokenize, count, drop stop words) for the
# rank/frequency analysis
df1<-df_2017%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)%>%
  anti_join(stop_words,by="word")

df1 <- df1 %>% filter(!word %in% irr)

# Rank follows the sort order (1 = most frequent); term frequency is the
# share of the corpus total
df1$total<-sum(df1$n)
frequency_by_rank <- df1%>%
  mutate(rank=row_number(),`term frequency`= n/total)
# Zipf's law predicts an approximately straight line on log-log axes
frequency_by_rank %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_line(size = 1.1, show.legend = TRUE) + 
  scale_x_log10() +
  scale_y_log10()

# NOTE(review): rank_subset is computed but never used - the abline
# coefficients below are hard-coded rather than fitted from this subset
rank_subset <- frequency_by_rank %>% 
  filter(rank < 2000,
         rank > 0)
frequency_by_rank %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_abline(intercept = -1.5015, slope = -0.7532, 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

Bigram Network graphs for the year 2017

# Tokenize 2017 tweets into consecutive word pairs (bigrams) and count them
df_2017_bigrams <- df_2017 %>%
  unnest_tokens(bigram, tweet, token = "ngrams", n = 2)
df_2017_bigrams <- df_2017_bigrams %>%
  count(bigram, sort = TRUE)
separated_bigrams_2017 <- df_2017_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Drop NA bigrams (tweets shorter than two tokens) in addition to stop words;
# previously they slipped through the %in% filters and
# graph_from_data_frame() warned and created a literal "NA" vertex
filtered_bigrams_2017 <- separated_bigrams_2017 %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% c(stop_words$word,irr)) %>%
  filter(!word2 %in% c(stop_words$word,irr))

# Keep only bigrams occurring more than 5 times and turn them into a graph
bigram_graph_2017 <- filtered_bigrams_2017 %>%
  filter(n > 5) %>%
  graph_from_data_frame()

set.seed(2017)  # fixed seed so the force-directed ("fr") layout is reproducible

ggraph(bigram_graph_2017, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

set.seed(2017)

# Closed arrowheads show bigram direction (word1 -> word2)
df_a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph_2017, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = df_a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 4) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  labs(title = "2017", edge_width="weight") +
  theme_void()

2018

Word frequency for 2018

# Tokenize each 2018 tweet into single words and count occurrences,
# most frequent first
data_2018<-df_2018%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)

Excluding stop words for the year 2018

# Remove common English stop words via tidytext's stop_words lexicon
data_2018<-data_2018%>%
  anti_join(stop_words,by="word")

Excluding irrelevant words to the analysis such as t.co, https, http

# `irr` was defined in the 2017 section
data_2018<- data_2018 %>%
  filter(!word %in% irr)

Displaying top 10 words by highest value of word frequency for the year 2018

# data_2018 is already sorted by n, so the first 10 rows are the top words
top_2018 <- head(data_2018,10)
top_2018
##               word   n
## 1              amp 527
## 2            tesla 450
## 3              car 120
## 4                3 112
## 5            model  98
## 6           spacex  89
## 7           people  73
## 8             time  59
## 9  fredericlambert  57
## 10            cars  55

Plotting histograms of word frequency for the year 2018

# Attach the corpus total, rank (1 = most frequent), and relative term
# frequency in one mutate() call - equivalent to assigning `total` first and
# mutating afterwards.
data_2018 <- data_2018 %>%
  mutate(
    total = sum(n),
    rank = row_number(),
    `term frequency` = n / total
  )

# Term-frequency histogram for 2018; x-axis truncated at 0.0009
ggplot(data_2018, aes(`term frequency`, fill = word)) +
  geom_histogram(colour = "red", show.legend = FALSE, bins = 30) +
  xlim(NA, 0.0009)
## Warning: Removed 123 rows containing non-finite values (stat_bin).
## Warning: Removed 6627 rows containing missing values (geom_bar).

Using Zipf’s law and plotting log-log plots of word frequencies and rank for the year 2018

# Rebuild the 2018 word counts (tokenize, count, drop stop words) for the
# rank/frequency analysis
df2<-df_2018%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)%>%
  anti_join(stop_words,by="word")

df2 <- df2 %>% filter(!word %in% irr)

# Rank follows the sort order; term frequency is the share of the total
df2$total<-sum(df2$n)
frequency_by_rank_df2 <- df2%>%
  mutate(rank=row_number(),`term frequency`= n/total)
frequency_by_rank_df2 %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_line(size = 1.1, show.legend = TRUE) + 
  scale_x_log10() +
  scale_y_log10()

# NOTE(review): rank_subset_2018 is unused; the abline coefficients are
# hard-coded (slope -0.7538 here vs -0.7532 in the other years - confirm
# which fit was intended)
rank_subset_2018 <- frequency_by_rank_df2 %>% 
  filter(rank < 2000,
         rank > 0)
frequency_by_rank_df2 %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_abline(intercept = -1.5015, slope = -0.7538, 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

Bigram Network graphs for the year 2018

# Tokenize 2018 tweets into bigrams and count them
B_2018_bigrams <- df_2018 %>%
  unnest_tokens(bigram, tweet, token = "ngrams", n = 2)
B_2018_bigrams <- B_2018_bigrams %>%
  count(bigram, sort = TRUE)
separated_bigrams_2018 <- B_2018_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Drop NA bigrams (tweets shorter than two tokens) in addition to stop words;
# previously they became a literal "NA" vertex in the graph (see old warning)
filtered_bigrams_2018 <- separated_bigrams_2018 %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% c(stop_words$word,irr)) %>%
  filter(!word2 %in% c(stop_words$word,irr))

# Keep only bigrams occurring more than 6 times
bigram_graph_2018 <- filtered_bigrams_2018 %>%
  filter(n > 6) %>%
  graph_from_data_frame()

set.seed(2018)  # reproducible force-directed layout

ggraph(bigram_graph_2018, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

SHOWING EDGES WITH WEIGHT > 6 FOR BETTER VISUALIZATION

set.seed(2018)

# Arrow spec for this year's directed plot. The original defined df_b but
# then passed df_a (from the 2017 chunk) - the author even flagged it with
# "##why df_a". Use the local df_b so the chunk is self-contained.
df_b <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph_2018, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = df_b, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  labs(title = "2018") +
  theme_void()

2019

Word frequency for 2019

# Tokenize each 2019 tweet into single words and count occurrences,
# most frequent first
data_2019<-df_2019%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)

Excluding stop words for the year 2019

# Remove common English stop words via tidytext's stop_words lexicon
data_2019<-data_2019%>%
  anti_join(stop_words,by="word")

Excluding irrelevant words to the analysis

# `irr` was defined in the 2017 section
data_2019<- data_2019 %>%
  filter(!word %in% irr)

Displaying top 10 words by highest value of word frequency for the year 2019

# data_2019 is already sorted by n, so the first 10 rows are the top words
top_2019 <- head(data_2019,10)
top_2019
##              word    n
## 1           tesla 1329
## 2             amp 1218
## 3          spacex  430
## 4           model  373
## 5  erdayastronaut  300
## 6             car  285
## 7               3  283
## 8            time  225
## 9          rocket  209
## 10           cars  199

Plotting histograms of word frequency for the year 2019

# Attach the corpus total, rank (1 = most frequent), and relative term
# frequency in one mutate() call - equivalent to assigning `total` first and
# mutating afterwards.
data_2019 <- data_2019 %>%
  mutate(
    total = sum(n),
    rank = row_number(),
    `term frequency` = n / total
  )

# Term-frequency histogram for 2019; x-axis truncated at 0.0009
ggplot(data_2019, aes(`term frequency`, fill = word)) +
  geom_histogram(colour = "green", show.legend = FALSE, bins = 30) +
  xlim(NA, 0.0009)
## Warning: Removed 98 rows containing non-finite values (stat_bin).
## Warning: Removed 15142 rows containing missing values (geom_bar).

Using Zipf’s law and plotting log-log plots of word frequencies and rank for the year 2019

# Rebuild the 2019 word counts (tokenize, count, drop stop words) for the
# rank/frequency analysis
df3<-df_2019%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)%>%
  anti_join(stop_words,by="word")

df3 <-df3 %>% filter(!word %in% irr)
# Rank follows the sort order; term frequency is the share of the total
df3$total<-sum(df3$n)
frequency_by_rank_df3 <- df3%>%
  mutate(rank=row_number(),`term frequency`= n/total)
frequency_by_rank_df3 %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) + 
  scale_x_log10() +
  scale_y_log10()

# NOTE(review): rank_subset_2019 is unused; the abline coefficients below
# are hard-coded rather than fitted from this subset
rank_subset_2019 <- frequency_by_rank_df3 %>% 
  filter(rank < 2000,
         rank > 0)
frequency_by_rank_df3 %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_abline(intercept = -1.5015, slope = -0.7532, 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

Bigram Network graphs for the year 2019

# Tokenize 2019 tweets into bigrams and count them
C_2019_bigrams <- df_2019 %>%
  unnest_tokens(bigram, tweet, token = "ngrams", n = 2)
C_2019_bigrams <- C_2019_bigrams %>%
  count(bigram, sort = TRUE)
separated_bigrams_2019 <- C_2019_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Drop NA bigrams (tweets shorter than two tokens) in addition to stop words;
# previously they became a literal "NA" vertex (visible as the NA->NA edge in
# the old printed graph)
filtered_bigrams_2019 <- separated_bigrams_2019 %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% c(stop_words$word,irr)) %>%
  filter(!word2 %in% c(stop_words$word,irr))
# Keep only bigrams occurring more than 12 times
bigram_graph_2019 <- filtered_bigrams_2019 %>%
  filter(n > 12) %>%
  graph_from_data_frame()
bigram_graph_2019
set.seed(2019)  # reproducible force-directed layout

ggraph(bigram_graph_2019, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

Visualizing edges with weight > 12 for better visualization

# Use the year's own seed (the original seeded with 2020 inside the 2019
# section, inconsistent with every other year)
set.seed(2019)

# Arrow spec for this year's directed plot; the original defined df_c but
# then passed df_a from the 2017 chunk - use the local df_c
df_c <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph_2019, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = df_c, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

2020

Word frequency for 2020

# Tokenize each 2020 tweet into single words and count occurrences,
# most frequent first
data_2020<-df_2020%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)

Excluding stop words for the year 2020

# Remove common English stop words via tidytext's stop_words lexicon
data_2020<-data_2020%>%
  anti_join(stop_words,by="word")

Excluding irrelevant words to the analysis

# `irr` was defined in the 2017 section
data_2020<- data_2020 %>%
  filter(!word %in% irr)

Displaying top 10 words by highest value of word frequency for the year 2020

# data_2020 is already sorted by n, so the first 10 rows are the top words
top_2020 <- head(data_2020,10)
top_2020
##              word    n
## 1             amp 1822
## 2           tesla 1693
## 3          spacex  639
## 4  erdayastronaut  561
## 5         flcnhvy  424
## 6           model  403
## 7               3  333
## 8             car  326
## 9            time  284
## 10         people  253

Plotting histograms of word frequency for the year 2020

# Attach the corpus total, rank (1 = most frequent), and relative term
# frequency in one mutate() call - equivalent to assigning `total` first and
# mutating afterwards.
data_2020 <- data_2020 %>%
  mutate(
    total = sum(n),
    rank = row_number(),
    `term frequency` = n / total
  )

# Term-frequency histogram for 2020; x-axis truncated at 0.0009
ggplot(data_2020, aes(`term frequency`, fill = word)) +
  geom_histogram(colour = "yellow", show.legend = FALSE, bins = 30) +
  xlim(NA, 0.0009)
## Warning: Removed 99 rows containing non-finite values (stat_bin).
## Warning: Removed 18368 rows containing missing values (geom_bar).

Using Zipf’s law and plotting log-log plots of word frequencies and rank for the year 2020

# Rebuild the 2020 word counts (tokenize, count, drop stop words) for the
# rank/frequency analysis
df4<-df_2020%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)%>%
  anti_join(stop_words,by="word")

df4 <- df4 %>% filter(!word %in% irr)
# Rank follows the sort order; term frequency is the share of the total
df4$total<-sum(df4$n)
frequency_by_rank_df4 <- df4%>%
  mutate(rank=row_number(),`term frequency`= n/total)
frequency_by_rank_df4 %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) + 
  scale_x_log10() +
  scale_y_log10()

# NOTE(review): rank_subset_2020 is unused; the abline coefficients below
# are hard-coded rather than fitted from this subset
rank_subset_2020 <- frequency_by_rank_df4 %>% 
  filter(rank < 2000,
         rank > 0)
frequency_by_rank_df4 %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_abline(intercept = -1.5015, slope = -0.7532, 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

#Bigram Network graphs for the year 2020

# Tokenize 2020 tweets into bigrams and count them
D_2020_bigrams <- df_2020 %>%
  unnest_tokens(bigram, tweet, token = "ngrams", n = 2)
D_2020_bigrams <- D_2020_bigrams %>%
  count(bigram, sort = TRUE)
separated_bigrams_2020 <- D_2020_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Drop NA bigrams (tweets shorter than two tokens) in addition to stop words;
# previously they became a literal "NA" vertex (visible as the NA->NA edge in
# the old printed graph)
filtered_bigrams_2020 <- separated_bigrams_2020 %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% c(stop_words$word,irr)) %>%
  filter(!word2 %in% c(stop_words$word,irr))
# Keep only bigrams occurring at least 15 times
bigram_graph_2020 <- filtered_bigrams_2020 %>%
  filter(n >= 15) %>%
  graph_from_data_frame()
bigram_graph_2020
set.seed(2020)  # reproducible force-directed layout

ggraph(bigram_graph_2020, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

Visualizing edges with weight >= 15 for better visualization

set.seed(2020)

# Arrow spec for this year's directed plot; the original defined df_d but
# then passed df_a from the 2017 chunk - use the local df_d
df_d <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph_2020, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = df_d, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

2021

Word frequency for 2021

# Tokenize each 2021 tweet into single words and count occurrences,
# most frequent first
data_2021<-df_2021%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)

Excluding stop words for the year 2021

# Remove common English stop words via tidytext's stop_words lexicon
data_2021<-data_2021%>%
  anti_join(stop_words,by="word")

Excluding irrelevant words to the analysis

# `irr` was defined in the 2017 section
data_2021<- data_2021 %>%
  filter(!word %in% irr)

Displaying top 10 words by highest value of word frequency for the year 2021

# data_2021 is already sorted by n, so the first 10 rows are the top words
top_2021 <- head(data_2021,10)
top_2021
##              word    n
## 1             amp 1927
## 2           tesla 1733
## 3          spacex  696
## 4  erdayastronaut  606
## 5         flcnhvy  442
## 6           model  404
## 7               3  339
## 8             car  334
## 9            time  302
## 10       ppathole  268

Plotting histograms of word frequency for the year 2021

# Attach the corpus total, rank (1 = most frequent), and relative term
# frequency in one mutate() call - equivalent to assigning `total` first and
# mutating afterwards.
data_2021 <- data_2021 %>%
  mutate(
    total = sum(n),
    rank = row_number(),
    `term frequency` = n / total
  )

# Term-frequency histogram for 2021; x-axis truncated at 0.0009
ggplot(data_2021, aes(`term frequency`, fill = word)) +
  geom_histogram(colour = "Black", show.legend = FALSE, bins = 30) +
  xlim(NA, 0.0009)
## Warning: Removed 99 rows containing non-finite values (stat_bin).
## Warning: Removed 18869 rows containing missing values (geom_bar).

Using Zipf’s law and plotting log-log plots of word frequencies and rank for the year 2021

# Rebuild the 2021 word counts (tokenize, count, drop stop words) for the
# rank/frequency analysis
df5<-df_2021%>%
 unnest_tokens(word,tweet)%>%
  count(word, sort = TRUE)%>%
  anti_join(stop_words,by="word")

df5 <- df5 %>% filter(!word %in% irr)
# Rank follows the sort order; term frequency is the share of the total
df5$total<-sum(df5$n)
frequency_by_rank_df5 <- df5%>%
  mutate(rank=row_number(),`term frequency`= n/total)
frequency_by_rank_df5 %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) + 
  scale_x_log10() +
  scale_y_log10()

# NOTE(review): rank_subset_2021 is unused; the abline coefficients below
# are hard-coded rather than fitted from this subset
rank_subset_2021 <- frequency_by_rank_df5 %>% 
  filter(rank < 2000,
         rank > 0)
frequency_by_rank_df5 %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_abline(intercept = -1.5015, slope = -0.7532, 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

Bigram Network graphs for the year 2021

# Tokenize 2021 tweets into bigrams and count them
E_2021_bigrams <- df_2021 %>%
  unnest_tokens(bigram, tweet, token = "ngrams", n = 2)
E_2021_bigrams <- E_2021_bigrams %>%
  count(bigram, sort = TRUE)
separated_bigrams_2021 <- E_2021_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Drop NA bigrams (tweets shorter than two tokens) in addition to stop words;
# previously they became a literal "NA" vertex in the graph (see old warning)
filtered_bigrams_2021 <- separated_bigrams_2021 %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% c(stop_words$word,irr)) %>%
  filter(!word2 %in% c(stop_words$word,irr))
# Keep only bigrams occurring more than 15 times
bigram_graph_2021 <- filtered_bigrams_2021 %>%
  filter(n > 15) %>%
  graph_from_data_frame()
set.seed(2021)  # reproducible force-directed layout

ggraph(bigram_graph_2021, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

Visualizing edges with weight > 15 for better visualization

set.seed(2021)

# Arrow spec for this year's directed plot; the original defined df_e but
# then passed df_a from the 2017 chunk - use the local df_e
df_e <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph_2021, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = df_e, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

Part 3 and 4 in one plot

# Combine all five years into one corpus, tagging each tweet with its year
tw_2017_df <- data.frame(tweet=df_2017$tweet, year=2017)
tw_2018_df <- data.frame(tweet=df_2018$tweet, year=2018)
tw_2019_df <- data.frame(tweet=df_2019$tweet, year=2019)
tw_2020_df <- data.frame(tweet=df_2020$tweet, year=2020)
tw_2021_df <- data.frame(tweet=df_2021$tweet, year=2021)

tw_df <- rbind(tw_2017_df, tw_2018_df, tw_2019_df, tw_2020_df, tw_2021_df)
# Tokenize, drop stop words and irrelevant tokens, count per year
tw_words <- tw_df %>% unnest_tokens(word, tweet)
tw_filtred <- tw_words %>% filter(!word %in% c(stop_words$word,irr))
tw_count <- tw_filtred %>% count(year, word, sort = TRUE)

# Per-year corpus totals
total_words <- tw_count %>% 
  group_by(year) %>% 
  summarize(total = sum(n))

# Join explicitly by year (silences the "Joining, by = ..." message)
tw_count <- left_join(tw_count, total_words, by = "year")
tw_count <- tw_count %>% mutate(term_frequency=n/total)
# Faceted term-frequency histograms, one panel per year; the x-axis is
# truncated at 0.0009 (the old title said "> 0.09" - it did not match the
# actual cutoff). bins is set explicitly to silence the stat_bin message.
ggplot(tw_count, aes(term_frequency, fill=year)) +
  geom_histogram(show.legend = FALSE, bins = 30) +
  xlim(NA, 0.0009) +
  labs(x="n/total",
       title="word frequencies as a share of total words (> 0.0009 excluded)") +
  facet_wrap(~year, ncol = 2, scales = "free_y")

# Per-year rank and frequency; rows are already sorted by n within the
# overall count, so row_number() within each year gives the rank
fr_rank_all <- tw_count %>%
  group_by(year) %>%
  mutate(rank = row_number(), frequency = n / sum(n))

# One Zipf curve per year on shared log-log axes.
# Fixed "world" -> "word" in the title and T -> TRUE for show.legend.
fr_rank_all %>%
  ggplot(aes(x = rank, y = frequency, color = as.character(year))) +
  geom_line(show.legend = TRUE) +
  scale_x_log10() +
  scale_y_log10() +
  labs(title = "word frequency by rank for all the years", color = "year")